#delimit ;
/* ---------------------------------------------------------------------
   Build the CPS analysis datasets.
   Everything this script prints is written to makedata.log.
   Statements are terminated by a semicolon (see the delimit line above).
   --------------------------------------------------------------------- */

/* close any log left open by an earlier run, then start a fresh text log */
capture log close ;
log using makedata.log, text replace ;

clear ;
set matsize 2000 ;

/* presumably defines the rawdat and madedat globals used for file paths
   below - confirm in set_directory_macros.do */
do set_directory_macros ;

/* load the raw CPS extract.  clear is the documented option of the use
   command for discarding whatever is currently in memory */
use "${rawdat}\cps_00003" , clear ;

/* quick look at every variable before any sample restriction */
summarize ;

/* states uniquely defined only from 1977 on */
keep if year >= 1977 ;

/* drop observations with a missing value in any core variable.
   missing() is used instead of a comparison with "." so that extended
   missing values (.a through .z) are removed as well.
   NOTE(review): "state" is not in the keep list used later in this file
   (that list has statefip), so this name presumably resolves to statefip
   through variable abbreviation - confirm against the raw extract. */

foreach vv in educ age sex year state incwage { ;

	di "`vv'" ;
	drop if missing(`vv') ;

} ;

/* drop if earnings allocated: only rows whose qincwage flag equals 0 are kept */
drop if qincwage != 0 ;

/* turn education into years */

/* Map each categorical educ code to a number of years of schooling.
   Codes that receive the same number of years are grouped with inlist().
   Any educ code not listed below leaves yrseduc missing.
   NOTE(review): the codes presumably follow the IPUMS-CPS EDUC scheme -
   confirm against the codebook of the extract. */

gen yrseduc = . ;

replace yrseduc = 0  if educ == 2 ;
replace yrseduc = 1  if educ == 11 ;
replace yrseduc = 2  if educ == 12 ;
replace yrseduc = 3  if inlist(educ, 10, 13) ;
replace yrseduc = 4  if educ == 14 ;

replace yrseduc = 5  if educ == 21 ;
replace yrseduc = 6  if inlist(educ, 20, 22) ;

replace yrseduc = 7  if educ == 31 ;
replace yrseduc = 8  if inlist(educ, 30, 32) ;

replace yrseduc = 9  if educ == 40 ;
replace yrseduc = 10 if educ == 50 ;
replace yrseduc = 11 if inlist(educ, 60, 71) ;
replace yrseduc = 12 if inlist(educ, 72, 73) ;

replace yrseduc = 13 if educ == 80 ;
replace yrseduc = 14 if inlist(educ, 81, 90, 91, 92) ;
replace yrseduc = 15 if educ == 100 ;

/* NOTE(review): codes 121 and 122 are assigned 16 years and no code is
   assigned 17 - confirm this is intended */
replace yrseduc = 16 if inlist(educ, 110, 111, 121, 122) ;
replace yrseduc = 18 if educ == 123 ;
replace yrseduc = 19 if educ == 124 ;
replace yrseduc = 20 if educ == 125 ;

/* Drop observations whose education could not be converted to years.
   The earlier test (yrseduc equal to 999) could never be true because
   yrseduc is never assigned 999, so unknown education codes stayed in the
   sample with a missing yrseduc.  Unmapped codes, 999 included, are the
   ones left missing above, so test for missing instead. */
drop if missing(yrseduc) ;

/* restrict to workers with sufficient labor force attachment and
   nonzero wage income */
drop if WKSWORK2 < 4 ;  /* fewer than 40 weeks worked in last year */
drop if uhrswork < 30 ; /* fewer than 30 usual hours per week worked in last year */
keep if incwage != 0 ;

/* build the hourly wage that feeds ln(earnings/hour) */

/* wage income rescaled by the CPI99 factor */
generate incwage1999 = incwage * CPI99 ;

/* weeks worked assigned to each retained WKSWORK2 category;
   any other category is left missing */
generate numweeks = cond(WKSWORK2 == 4, 43.5,
                    cond(WKSWORK2 == 5, 48.5,
                    cond(WKSWORK2 == 6, 50, .))) ;
generate annual_hours = numweeks * uhrswork ;

generate wage_per_hour = incwage1999 / annual_hours ;
summarize wage_per_hour , detail ;

/* trim to hourly wages between 2 and 100 inclusive.  A missing wage fails
   the upper bound and is therefore dropped here too */
keep if wage_per_hour >= 2 & wage_per_hour <= 100 ;


/* outcome: natural log of the hourly wage */
generate lnwage = log(wage_per_hour) ;
*graph twoway hist lnwage ;

/* quadratic term in age */
generate age2 = age^2 ;

/* retain only the variables used downstream */
keep statefip year wtsupp sex age age2 yrseduc wage_per_hour lnwage ;

/* Feb 19, 2014:  doug thinks these two lines can be cut:
reg lnwage age age2 sex yrseduc i.year i.statefip ;
predict resid , resid ;
*/

summarize ;


/* full cleaned micro file, all years */
save "${madedat}\CPS_all_micro" , replace ;

/* 2012 cross-section: analysis data file for Tables 1 and 2 */
drop if year != 2012 ;
summarize ;
save "${madedat}\CPS_2012_micro" , replace ;

/* medium subsample: each 2012 observation is retained when its uniform
   draw falls below 0.20.  The seed is fixed before the draw */
set seed 10101 ;
quietly drop if uniform() >= 0.20 ;
save "${madedat}\CPS_2012_micro_medium" , replace ;

/* small subsample, drawn from the medium one.
   15% of 20% is 3% of original */
set seed 10102 ;
quietly drop if uniform() >= 0.15 ;
save "${madedat}\CPS_2012_micro_small" , replace ;



/* get state-year panel ready */
/* reload the all-years micro file.  clear is the documented option of the
   use command for discarding the data currently in memory */
use "${madedat}\CPS_all_micro" , clear ;

/* partial out differences in demographics across state years.  So, regress
   lnwage on the demographics while absorbing one effect per state-year cell */
egen styr = group(statefip year) ;
areg lnwage age age2 sex yrseduc [pw=wtsupp] , absorb(styr) ;
predict lnwage_sy , d ; /* uses the estimated value of the fixed effect for prediction, outcome_sy_hat */


/* one row per state-year: weighted means of the raw and adjusted log wage,
   plus the unweighted sum of wtsupp, renamed popweight */
collapse (mean) lnwage lnwage_sy (rawsum) wtsupp [pw=wtsupp] , by(statefip year) ;
rename wtsupp popweight ;
summarize ;
save "${madedat}\CPS_panel" , replace ;

/* close the log opened at the top of this script.  If further steps are
   appended below, move this line to the end of the file */
log close ;

